# --- Imports and environment setup ---
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Qt5Agg')  # interactive Qt backend so plt.show() works as a script
# BUG FIX: '%matplotlib inline' is IPython magic — a syntax error in a plain
# .py file. Keep it commented; uncomment only inside a notebook cell (it also
# conflicts with the Qt backend selected above).
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pandas_profiling import ProfileReport  # NOTE(review): package renamed to ydata-profiling — consider migrating
from dataprep.eda import create_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import catboost
# Single explicit import (previously split across a star import and two
# duplicate 'from catboost import ...' lines).
from catboost import CatBoostClassifier, Pool, cv, metrics
import shap
shap.initjs()  # load SHAP's JS so its interactive plots render in a notebook
# 'IPython.core.display' is deprecated; the supported public location is
# IPython.display.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))  # widen notebook cells
import warnings
warnings.filterwarnings('ignore')  # silence library warnings for cleaner output
# --- Load the click log ---
folder_name = '../data/'
# FIX: build the path from folder_name so the data directory is defined in
# exactly one place (file_name previously hard-coded the same directory).
file_name = folder_name + 'click_data.txt'
# Tab-separated log; parse LogEntryTime up front so .dt accessors work below.
data_df = pd.read_csv(file_name, delimiter='\t', parse_dates=['LogEntryTime'])
# Weekday name (e.g. 'Monday') for day-of-week aggregation later on.
data_df['day_of_week'] = data_df['LogEntryTime'].dt.day_name()
# Automated EDA report; opens in the default browser.
create_report(data_df).show_browser()
# Mean click-through rate per category for a few low-cardinality features,
# one bar chart per feature (bars sorted by total, descending).
for feature in ['DeviceType', 'OSFamily', 'Location']:
    ctr_by_category = data_df.groupby(feature)['Click'].mean()
    fig = px.bar(ctr_by_category, x="Click")
    fig.update_layout(yaxis={'categoryorder': 'total descending'})
    fig.show()
# Top-10 sites by total clicks, as a horizontal bar chart.
top_sites = data_df.groupby('site')['Click'].sum().sort_values(ascending=False)[:10]
fig = px.bar(top_sites, x="Click", orientation='h')
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})
fig.show()  # BUG FIX: the figure was built but never displayed

# Collapse rare sites (fewer than 2000 rows) into an "Others" bucket to keep
# cardinality manageable for plotting and as a categorical model feature.
data_df['_site'] = data_df.site
data_df.loc[data_df['_site'].value_counts()[data_df['_site']].values < 2000, '_site'] = "Others"
# Count plot of each categorical feature, faceted by click outcome.
categorical_columns = ['DeviceType', 'OSFamily', 'AdFormat', 'Location', '_site']
for column in categorical_columns:
    print(column)
    facet_grid = sns.catplot(data=data_df, x=column, col='Click', kind='count')
    plt.show()
# Notebook output of the loop above (each feature name printed in turn):
# DeviceType
# OSFamily
# AdFormat
# Location
# _site
# Average winning price (CPM) by weekday over the first week of data
# (2019-02-01 to 2019-02-07).
first_week = data_df.set_index('LogEntryTime').loc['2019-02-01':'2019-02-07']
avg_price_by_weekday = first_week.groupby('day_of_week')['WinningPriceCPMInBucks'].mean()
# FIX: a bare expression only auto-displays in a notebook; print explicitly
# so the result is visible when this runs as a script.
print(avg_price_by_weekday)
# Notebook output (truncated): day_of_week  Saturday  3.597775  Name: WinningPriceCPMInBucks, dtype: float64
# --- Build the modelling matrix (categorical features only) ---
cat_features = ['DeviceType', 'OSFamily', 'AdFormat', 'Location', '_site', 'day_of_week']
X = data_df[cat_features]
y = data_df['Click']
# CatBoost rejects NaN in categorical columns; map missing to an explicit
# 'None' level instead.
X = X.fillna('None')
# Stratify on the label so the (imbalanced) click rate is equal across splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=45, stratify=y
)
# FIX: bare X.head() only auto-displays in a notebook; print for script runs.
print(X.head())
# Notebook output — X.head() rendered as a table:
# | DeviceType | OSFamily | AdFormat | Location | _site | day_of_week | |
# |---|---|---|---|---|---|---|
# | 0 | Mobile | iOS | 300x250 | ??? | Others | Tuesday |
# | 1 | PC | OSX | 728x90 | 1 | Others | Thursday |
# | 2 | PC | Windows | 300x250 | ??? | Others | Wednesday |
# | 3 | PC | Windows | 728x90 | ??? | Others | Thursday |
# | 4 | PC | OSX | 300x250 | 3 | quizlet.com | Sunday |
# Baseline CatBoost classifier: default settings apart from a fixed seed so
# the run is reproducible.
model = CatBoostClassifier(
    iterations=100,
    cat_features=cat_features,
    learning_rate=0.1,
    random_seed=45,
)
model.fit(X_train, y_train, verbose=False, plot=False)
# Notebook output: <catboost.core.CatBoostClassifier at 0x7ffe9aef0df0>
# Horizontal bar chart of CatBoost feature importances, least to most
# important from bottom to top.
importances = model.feature_importances_
order = np.argsort(importances)
feature_names = X_train.columns
bar_positions = range(len(order))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[order], color='g', align='center')
plt.yticks(bar_positions, [feature_names[i] for i in order])
plt.xlabel('Relative Importance')
plt.show()
# SHAP values over the full dataset: per-row, per-feature contributions to
# the model's output.
explainer = shap.TreeExplainer(model)
full_pool = Pool(X, y, cat_features=cat_features)
shap_values = explainer.shap_values(full_pool)
# Global summary plot: direction and magnitude of each feature's effect.
shap.summary_plot(shap_values, X)
# Second model: up-weight the positive (click) class 5x to counter the class
# imbalance, and track recall on a held-out set while training.
pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
training_params = dict(
    iterations=100,
    loss_function="Logloss",
    eval_metric='Recall',
    scale_pos_weight=5,
    learning_rate=0.1,
    verbose=False,
)
model = CatBoostClassifier(**training_params)
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    plot=True,
)
# Notebook output: <catboost.core.CatBoostClassifier at 0x7ffea2479340>
confusion_matrix(y_test, model.predict(X_test))
# Notebook output:
# array([[25624, 90996],
#        [  191, 31373]])
print(classification_report(y_test, model.predict(X_test)))
# Notebook output:
#               precision    recall  f1-score   support
#            0       0.99      0.22      0.36    116620
#            1       0.26      0.99      0.41     31564
#     accuracy                           0.38    148184
#    macro avg       0.62      0.61      0.38    148184
# weighted avg       0.84      0.38      0.37    148184
# Visualize the first tree (tree_idx=0) of the trained ensemble, using the
# training pool built above for split/feature information.
# NOTE(review): this appears to return a renderable object that auto-displays
# in a notebook; in a plain script the return value is discarded — confirm,
# and capture/.render() it if running outside a notebook.
model.plot_tree(
    tree_idx=0,
    pool=pool
)